library(tidyverse)
library(readr)
# Load the four input tables from ../data.
# NOTE(review): `orders` is not referenced anywhere in the visible part of
# this script - confirm it is used later before keeping the read.
orders <- read_csv("../data/ordersall.csv")
# booktest.csv has an unnamed third column; readr auto-names it X3 (see the
# warning below).
test <- read_csv("../data/booktest.csv")
## Warning: Missing column names filled in: 'X3' [3]
book <- read_csv("../data/book.csv")
# booktrain.csv triggers the same unnamed-third-column warning as booktest.csv.
train <- read_csv("../data/booktrain.csv")
## Warning: Missing column names filled in: 'X3' [3]
# Modelling table: rows of `book` with a non-missing logtargamt, restricted to
# the id, the response source column and the four base predictors.
new_book <- book[!is.na(book$logtargamt), ]
new_book <- subset(
  new_book,
  select = c(id, logtargamt, recency, frequency, amount, tof)
)

# Binary response: 1 when logtargamt is strictly positive, 0 otherwise.
new_book$after <- ifelse(new_book$logtargamt > 0, 1, 0)

# Hand-built interaction terms, kept as explicit columns so the fitted
# formulas (and the printed `Call:` lines) reference them by name.
new_book$afint <- new_book$amount * new_book$frequency
new_book$rfint <- new_book$recency * new_book$frequency

# Logistic regressions of `after`:
#   fit  - main effects only
#   fit2 - main effects + amount x frequency
#   fit3 - main effects + recency x frequency
#   fit4 - main effects + both interactions
fit <- glm(
  after ~ amount + recency + frequency + tof,
  family = binomial,
  data = new_book
)
fit2 <- glm(
  after ~ amount + recency + frequency + tof + afint,
  family = binomial,
  data = new_book
)
fit3 <- glm(
  after ~ amount + recency + frequency + tof + rfint,
  family = binomial,
  data = new_book
)
fit4 <- glm(
  after ~ amount + recency + frequency + tof + afint + rfint,
  family = binomial,
  data = new_book
)
# Main-effects model. In the output below recency, frequency and tof are
# significant, amount is not (p = 0.268); AIC = 2619.6.
summary(fit)
##
## Call:
## glm(formula = after ~ amount + recency + frequency + tof, family = binomial,
## data = new_book)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.7950 -0.3388 -0.2690 -0.1799 3.5454
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.475e+00 9.779e-02 -25.306 < 2e-16 ***
## amount -2.690e-04 2.429e-04 -1.108 0.268073
## recency -1.291e-03 2.210e-04 -5.841 5.19e-09 ***
## frequency 4.342e-02 1.227e-02 3.539 0.000401 ***
## tof -4.302e-04 9.657e-05 -4.455 8.38e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2750.5 on 8310 degrees of freedom
## Residual deviance: 2609.6 on 8306 degrees of freedom
## AIC: 2619.6
##
## Number of Fisher Scoring iterations: 7
# Main effects + afint. In the output below afint is not significant
# (p = 0.124); AIC = 2618.2, versus 2619.6 for `fit`.
summary(fit2)
##
## Call:
## glm(formula = after ~ amount + recency + frequency + tof + afint,
## family = binomial, data = new_book)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.6871 -0.3399 -0.2674 -0.1781 3.5372
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.533e+00 1.043e-01 -24.295 < 2e-16 ***
## amount 2.203e-04 3.634e-04 0.606 0.544415
## recency -1.181e-03 2.302e-04 -5.130 2.90e-07 ***
## frequency 5.779e-02 1.573e-02 3.674 0.000239 ***
## tof -5.316e-04 1.140e-04 -4.663 3.11e-06 ***
## afint -1.955e-05 1.271e-05 -1.538 0.124068
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2750.5 on 8310 degrees of freedom
## Residual deviance: 2606.2 on 8305 degrees of freedom
## AIC: 2618.2
##
## Number of Fisher Scoring iterations: 7
# Main effects + rfint. In the output below rfint is not significant
# (p = 0.728); AIC = 2621.5, higher than the 2619.6 of `fit`.
summary(fit3)
##
## Call:
## glm(formula = after ~ amount + recency + frequency + tof + rfint,
## family = binomial, data = new_book)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.7739 -0.3387 -0.2696 -0.1790 3.5613
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.465e+00 1.016e-01 -24.277 < 2e-16 ***
## amount -2.686e-04 2.440e-04 -1.101 0.270971
## recency -1.339e-03 2.643e-04 -5.066 4.05e-07 ***
## frequency 4.224e-02 1.280e-02 3.301 0.000964 ***
## tof -4.350e-04 9.757e-05 -4.459 8.25e-06 ***
## rfint 1.238e-05 3.560e-05 0.348 0.727969
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2750.5 on 8310 degrees of freedom
## Residual deviance: 2609.5 on 8305 degrees of freedom
## AIC: 2621.5
##
## Number of Fisher Scoring iterations: 7
# Main effects + both interactions. In the output below neither afint
# (p = 0.129) nor rfint (p = 0.986) is significant; AIC = 2620.2.
summary(fit4)
##
## Call:
## glm(formula = after ~ amount + recency + frequency + tof + afint +
## rfint, family = binomial, data = new_book)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.6881 -0.3399 -0.2673 -0.1782 3.5363
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.534e+00 1.101e-01 -23.023 < 2e-16 ***
## amount 2.212e-04 3.677e-04 0.602 0.547345
## recency -1.178e-03 2.806e-04 -4.198 2.70e-05 ***
## frequency 5.788e-02 1.651e-02 3.505 0.000456 ***
## tof -5.315e-04 1.140e-04 -4.660 3.16e-06 ***
## afint -1.958e-05 1.289e-05 -1.520 0.128576
## rfint -6.565e-07 3.750e-05 -0.018 0.986033
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2750.5 on 8310 degrees of freedom
## Residual deviance: 2606.2 on 8304 degrees of freedom
## AIC: 2620.2
##
## Number of Fisher Scoring iterations: 7
# Rate features: frequency and amount, each divided by tof.
# NOTE(review): the division yields NaN/Inf where tof is 0 and NA where tof is
# missing. The fit5 summary below reports 87 observations deleted due to
# missingness, which presumably comes from these rows - confirm that silently
# dropping them is intended.
new_book$ordersPer <- new_book$frequency / new_book$tof
new_book$amountPer <- new_book$amount / new_book$tof
new_book

# Main effects plus the two rate features.
fit5 <- glm(
  after ~ amount + recency + frequency + tof + ordersPer + amountPer,
  family = binomial,
  data = new_book
)
# Main effects + rate features. The output below reports 87 observations
# deleted due to missingness, so the deviances and AIC (2351.1) are computed
# on 8224 rows and are not directly comparable with fit..fit4 (8311 rows).
# amountPer is significant (p = 0.0066); ordersPer is not (p = 0.19).
summary(fit5)
##
## Call:
## glm(formula = after ~ amount + recency + frequency + tof + ordersPer +
## amountPer, family = binomial, data = new_book)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2880 -0.2921 -0.2553 -0.1922 3.3033
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.1043286 0.1236046 -25.115 < 2e-16 ***
## amount -0.0004797 0.0002585 -1.856 0.0635 .
## recency -0.0009004 0.0002023 -4.451 8.57e-06 ***
## frequency 0.0492773 0.0121759 4.047 5.18e-05 ***
## tof -0.0001787 0.0001018 -1.756 0.0791 .
## ordersPer 3.5928284 2.7391340 1.312 0.1896
## amountPer 0.2720166 0.1001442 2.716 0.0066 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2443.2 on 8223 degrees of freedom
## Residual deviance: 2337.1 on 8217 degrees of freedom
## (87 observations deleted due to missingness)
## AIC: 2351.1
##
## Number of Fisher Scoring iterations: 7
# Inner join of the modelling table with `train` on id (merge() keeps only
# ids present in both tables by default).
merged.data1 <- merge(new_book, train, by = "id")
# Preview only: base merge() returns a plain data.frame, so printing the bare
# object would dump every row to the console.
head(merged.data1)

# Reduced model: recency, frequency and the orders-per-tof rate.
# Note that this is fitted on `new_book`, not on `merged.data1`.
fit6 <- glm(
  after ~ recency + frequency + ordersPer,
  family = binomial,
  data = new_book
)
# Reduced model. In the output below recency, frequency and ordersPer are all
# significant at the 0.001 level; AIC = 2357.8 versus 2351.1 for `fit5`
# (both fitted on the same 8224 rows).
summary(fit6)
##
## Call:
## glm(formula = after ~ recency + frequency + ordersPer, family = binomial,
## data = new_book)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6138 -0.2968 -0.2609 -0.1944 3.3160
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.1478229 0.1077500 -29.214 < 2e-16 ***
## recency -0.0011000 0.0001874 -5.869 4.4e-09 ***
## frequency 0.0226879 0.0066679 3.403 0.000668 ***
## ordersPer 8.2243365 2.4069902 3.417 0.000633 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2443.2 on 8223 degrees of freedom
## Residual deviance: 2349.8 on 8220 degrees of freedom
## (87 observations deleted due to missingness)
## AIC: 2357.8
##
## Number of Fisher Scoring iterations: 7
# Scoring table: rows of `book` whose id also appears in `test`, reduced to
# the id, book's logtargamt (suffixed .x by the merge) and the base predictors,
# then extended with the same two rate features used for the models.
# NOTE(review): as with the modelling table, rows where tof is 0 or missing
# produce non-finite or missing rate values here - confirm how these should be
# scored.
merged <- merge(book, test, by = "id")
merged <- merged[
  ,
  c("id", "logtargamt.x", "recency", "frequency", "amount", "tof"),
  drop = FALSE
]
merged[["ordersPer"]] <- merged[["frequency"]] / merged[["tof"]]
merged[["amountPer"]] <- merged[["amount"]] / merged[["tof"]]
# `merged.data1` is the training data used to fit the regression.
# `merged` is the test data used for the predictions.
# Final scoring model: same formula as fit6, but fitted on `merged.data1`.
myglm <- glm(
  after ~ recency + frequency + ordersPer,
  data = merged.data1,
  family = "binomial"
)

# Predicted probabilities (type = "response") for every row of `merged`.
# NOTE(review): rows of `merged` with a non-finite or missing ordersPer will
# get degenerate or NA scores - confirm how these should be handled.
score <- predict(myglm, newdata = merged, type = "response")
length(score)
## [1] 25402
# Preview only: printing the bare data.frame would dump every row.
head(merged.data1)
# The length of `score` reported above (25402) is the length of the test data.